df <- read.csv("peaks_data.csv")
head(df)
## Description...
## 1 Neuroblast differentiation-associated protein AHNAK OS=Homo sapiens OX=9606 GN=AHNAK PE=1 SV=2
## 2 Plectin OS=Homo sapiens OX=9606 GN=PLEC PE=1 SV=3
## 3 Myosin-9 OS=Homo sapiens OX=9606 GN=MYH9 PE=1 SV=4
## 4 Filamin-A OS=Homo sapiens OX=9606 GN=FLNA PE=1 SV=4
## 5 Myosin-10 OS=Homo sapiens OX=9606 GN=MYH10 PE=1 SV=3
## 6 Cytoplasmic dynein 1 heavy chain 1 OS=Homo sapiens OX=9606 GN=DYNC1H1 PE=1 SV=5
## Accession Gene_id BT_1 BT_2 BT_3 BT_4
## 1 Q09666|AHNK_HUMAN AHNAK 3208900000 2374900000 2.8808e+09 1909900000
## 2 Q15149|PLEC_HUMAN PLEC 1943300000 2154300000 1.8461e+09 2267400000
## 3 P35579|MYH9_HUMAN MYH9 4878900000 3330900000 1.0736e+10 6690100000
## 4 P21333|FLNA_HUMAN FLNA 3334200000 2846200000 5.6878e+09 6196000000
## 5 P35580|MYH10_HUMAN MYH10 2329900000 1185000000 4.3183e+09 3231700000
## 6 Q14204|DYHC1_HUMAN DYNC1H1 544400000 254110000 8.8306e+08 505250000
## BT_5 BT_6 BT_7 BT_8 BT_9 BT_10 BT_11
## 1 3236700000 3255400000 3441100000 2948000000 2548700000 1770600000 2411100000
## 2 1739200000 3055300000 793750000 1187500000 1402000000 1922900000 1803500000
## 3 5485700000 4331500000 6710200000 5387200000 5847800000 5327800000 6895400000
## 4 4257700000 4771100000 3792500000 4698200000 3319800000 5489500000 4068500000
## 5 2724200000 1637000000 3258100000 3862000000 3442200000 3314800000 2670100000
## 6 527220000 467710000 716170000 514400000 560290000 390410000 724970000
## BT_12 BT_13 BT_14 CJK_1 CJK_2 CJK_3 CJK_4
## 1 2038000000 3105200000 2409800000 2296400000 201810000 5.5206e+09 1276900000
## 2 2018600000 1165800000 1869100000 1098600000 260490000 1.5393e+09 831110000
## 3 3779000000 5840500000 5312000000 4444200000 767110000 7.0216e+09 1986500000
## 4 3320400000 3754300000 4735100000 6418900000 1338400000 1.1726e+10 3389000000
## 5 2527500000 2741200000 2451800000 1737500000 342550000 4.5725e+09 668480000
## 6 334890000 615510000 424040000 873530000 151260000 1.6666e+09 403560000
## CJK_5 CJK_6 CJK_7 CJK_8 CJK_9 CJK_10 CJK_11
## 1 3526700000 1872000000 5.3895e+09 1107000000 5.8590e+09 3.4634e+09 7.2585e+09
## 2 1355200000 1775800000 1.5188e+09 648160000 3.5614e+09 2.8647e+09 3.3625e+09
## 3 4867800000 3289200000 7.6909e+09 1779800000 1.0729e+10 7.0765e+09 1.1502e+10
## 4 8381600000 7348900000 1.1474e+10 4003700000 1.3659e+10 1.4943e+10 1.6243e+10
## 5 4752800000 1279200000 7.0850e+09 1080600000 7.1581e+09 3.7303e+09 7.5260e+09
## 6 1012800000 987860000 1.1191e+09 197770000 1.4827e+09 1.1713e+09 1.5283e+09
## CJK_12 CJK_13 CJK_14 CJK_16 CJK_17 CJK_18 CJK_19
## 1 1182700000 2700500000 2.3516e+09 1.7541e+09 1479000000 1.6921e+09 595420000
## 2 1242900000 1354800000 1.5112e+09 9.8431e+08 1251900000 1.8326e+09 292760000
## 3 909280000 4644600000 5.5520e+09 5.2502e+09 4220100000 5.9802e+09 1907800000
## 4 5350900000 5984200000 1.1522e+10 1.5136e+10 4641000000 1.1778e+10 2661700000
## 5 230530000 2489700000 2.3143e+09 2.9472e+09 1776400000 1.6460e+09 941800000
## 6 483000000 813380000 9.7914e+08 8.0994e+08 727020000 8.6971e+08 402110000
## CJK_20
## 1 1666300000
## 2 1066500000
## 3 4852900000
## 4 7224100000
## 5 2054100000
## 6 524550000
NA by genes:
dfNA <- df[,-c(1:3)] %>% is.na %>% apply(1, sum)
dfNA %>%
as.data.frame() %>%
ggplot(aes(.)) +
geom_density() +
theme_minimal()
NA by samples:
dfNA <- df[,-c(1:3)] %>% is.na %>% apply(2, sum)
(dfNA/nrow(df)) %>%
as.data.frame() %>%
rownames_to_column("name") %>%
mutate("col" = name %>% str_detect("BT")) %>%
ggplot(aes(x = ., color = col)) +
geom_density() +
geom_beeswarm(aes(y = 0)) +
theme_minimal()
Pretty normal distributions, but with diffetent modes. It is interesting by itself, and removing samples with big number of NAs will broke normal distribution for normal samples.
I remove all genes with less than 10 hits in samples to get ~log distr.
dfNA <- df[,-c(1:3)] %>% is.na %>% apply(1, sum)
dfNA <- dfNA < 10
cat("Removed", sum(!dfNA),"genes")
## Removed 1788 genes
df <- df[dfNA,]
And re-plot NA distr for samples
dfNA <- df[,-c(1:3)] %>% is.na %>% apply(2, sum)
(dfNA/nrow(df)) %>%
as.data.frame() %>%
rownames_to_column("name") %>%
mutate("col" = name %>% str_detect("BT")) %>%
ggplot(aes(x = ., color = col)) +
geom_density() +
geom_beeswarm(aes(y = 0)) +
theme_minimal() +
scale_color_discrete("WT")
I remove all samples with NA amount > 0.1, to reduce the noise. It is not obligatory, I think, but so much samples)
df <- df[,c(T, T, T, (dfNA/nrow(df))<0.1)]
cat("Keep", sum((dfNA/nrow(df))<0.1), "samples")
## Keep 30 samples
Let’s check sample genes and values distributions
df_long <- df %>%
pivot_longer(cols = -c(1:3), values_to = "val")
ggplot(df_long, aes(log10(val), color = name %>% str_detect("BT"))) +
geom_density() +
theme_minimal() +
facet_wrap(~name) +
scale_color_discrete("WT")
## Warning: Removed 3698 rows containing non-finite outside the scale range
## (`stat_density()`).
Almost all samples have lognormal distributions and high levels, so no will be removed, at least now.
because why not
df_knn <- df[,-c(1:3)] %>%
t %>%
impute.knn(k = 2)
df_knn <- df_knn$data %>% t
df_exp <- colnames(df[,-c(1:3)]) %>% str_detect("BT") %>% as.factor()
df_knn %>%
as.data.frame() %>%
cbind(df$Gene_id) %>%
pivot_longer(cols = -(ncol(df_knn) + 1), values_to = "vals") %>%
ggplot(aes(name, x = log10(vals), colour = name %>% str_detect("BT"))) +
geom_boxplot(outliers = F) +
geom_jitter(size = 0.1, alpha = 0.1) +
theme_minimal() +
scale_color_discrete("WT")
## Warning: Removed 1114 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
OK, move on. All NAs now have level of ~ 10^(4)
df_norm <- (df_knn) %>%
log2 %>%
as.matrix %>%
normalizeQuantiles
df_norm[is.na(df_norm)] <- -1
df_norm[df_norm < 0] <- min(df_norm[df_norm > 0], na.rm = T)
df_norm %>%
as.data.frame() %>%
cbind(df$Gene_id) %>%
pivot_longer(cols = -(ncol(df_knn) + 1), values_to = "vals") %>%
ggplot(aes(name, x = vals, colour = name %>% str_detect("BT"))) +
geom_boxplot(outliers = F) +
geom_violin(fill = "transparent") +
geom_jitter(size = 0.1, alpha = 0.1) +
theme_minimal() +
scale_color_discrete("WT")
Same normal distributions, we can move on. They are a bit asymmetric, which could be reduced by using minmaxscaling or smth like this. It might be crucial for PCA/RDA, but not for limma.
df_rda <- df_norm %>%
t %>%
rda(scale = T, na.action = na.exclude)
ggplot(data.frame(N = df_rda$CA$eig)) +
geom_col(aes(y=N/sum(N), 1:length(N))) +
theme_minimal() +
xlab("PC") + ylab("% explained")
OK, 1st component is enough powerful, we can use it
df_scores <- data.frame(df_norm %>% t,
scores(df_rda, display = "sites",
choices = 1:3, scaling = "sites"))
ggplot(df_scores, aes(x = PC1, y = PC2)) +
geom_point(aes(color = df_exp), alpha = 0.5) +
coord_equal() +
theme_bw() +
scale_color_discrete("WT")
While PC1 indicate differences between experimental conditions, PC2 might indicate batch-effect. In theory, samples could be differentiate also by PC2, but next PC are similar for all samples, so we can move on and try to evaluate it per component
scores(df_rda, display = "sites", scaling = "sites", choices = 1:15) %>%
as.data.frame() %>%
rownames_to_column("type") %>%
pivot_longer(cols = -1, names_to = "PC") %>%
mutate(type = str_detect(type, "BT")) %>%
ggplot(aes(x = PC %>% str_remove_all("PC") %>% fct_inseq, y = value,
color = type)) +
geom_violin(draw_quantiles = 0.5, fill = "transparent") +
geom_beeswarm(size = 0.3) +
theme_bw() +
xlab("PC") +
scale_color_discrete("WT")
In general, disease samples are more variable according to RDA than wt. 2-4 components indicates mainly differences between them.
X1 <- df_norm[,colnames(df_norm) %>% str_detect("BT")]
X2 <- df_norm[,colnames(df_norm) %>% str_detect("BT", negate = T)]
RM <- function(x) rowMeans(x, na.rm = T)
X <- (RM(X1) + RM(X2)) / 2
Y <- (RM(X1) - RM(X2))
# График
ggplot(data.frame(x=X, y = Y), aes(X,Y)) +
geom_point(alpha = 0.5, size = 0.5) +
geom_smooth(method = "lm") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Looks like ~ normally distributed genes. I don’t know real batch differences, so it might be better to reduce batch based on 2 PC value. But - not now.
rownames(df_norm) <- df$Accession
expr_data <- df_norm %>% as.matrix()
pheno_data <- data.frame(df_exp)
rownames(pheno_data) <- colnames(df_norm)
pheno_metadata <- data.frame(
labelDescription = c("Experimental condition"),
row.names=c("Condition"))
pheno_data <- new("AnnotatedDataFrame",
data = pheno_data,
varMetadata = pheno_metadata)
feature_data <- data.frame(Prot = df$Gene_id)
rownames(feature_data) <- rownames(expr_data)
feature_metadata <- data.frame(
labelDescription = c("Protain name"),
row.names = c("Protain"))
f_data <- new("AnnotatedDataFrame",
data = feature_data,
varMetadata = feature_metadata)
exp_set <-
ExpressionSet(assayData = expr_data,
phenoData = pheno_data,
featureData = f_data)
X <- model.matrix(~ df_exp, pData(exp_set))
fit <- lmFit(exp_set, design = X, method = "robust", maxit = 1000)
efit <- eBayes(fit)
MA_limma <- function(efit, coef, n = 10, signif = TRUE, fdr = 0.05, lfc = 0, text = TRUE, cex.text = 0.8, col.text = "grey20", main = "MA-plot", xlab = "Average log-expression", ylab = "Expression log-ratio", pch = 19, pch.signif = 21, col = "darkgreen", alpha = 0.3, cex = 0.3, ...){
# соотношение и интенсивность
R <- efit$coefficients[, coef]
I <- efit$Amean
# прозрачный цвет
col_btransp <- adjustcolor(col, alpha.f = alpha)
# график
plot(I, R, cex = cex, main = main, pch = pch, xlab = xlab, ylab = ylab, col = col_btransp, ...)
abline(h = 0)
# отмечаем дифференциально-экспрессируемые белки
if(signif){
sign <- p.adjust(efit$p.value[, coef], method = "BH") <= fdr
large <- abs(efit$coefficients[, coef]) >= lfc
points(I[sign & large], R[sign & large], cex = cex*2, col = "orange2", pch = pch.signif)
}
# подписываем первые n белков с сильнее всего различающейся экспрессией
if(text){
ord <- order(efit$lods[, coef], decreasing = TRUE)
top_n <- ord[1:n]
text(I[top_n], R[top_n], labels = efit$genes[top_n, ], pos = 4, cex = cex.text, col = col.text)
}
}
MA_limma(efit, coef = 2, n = 30)
Pretty normal MA plot. I don’t now differentially expressed genes there, but they might be crucial for heart valve calcification. Or just a noise)
my_list <- topTable(efit, coef = 2, n = 100)
dif_exp_set <- exp_set[fData(exp_set)$Prot %in% my_list$Prot, ]
dat <- as.matrix(exprs(dif_exp_set))
rdat <- rownames(dat) %>%
match(df$Accession)
rownames(dat) <- df$Gene_id[rdat]
pal_blue_red <- colorpanel(75, low = "steelblue", mid = "black", high = "red")
heatmap.2(dat, col = pal_blue_red, scale = "row",
key = TRUE, symkey = FALSE,
density.info = "none", trace = "none",
cexRow = 0.9, cexCol = 1,
keysize = 0.7,
margins = c(6,6), key.par = list(mar = c(7,5,5,5)),
lhei = c(0.2,0.6), lwid = c(0.2,0.6)) %>% try(silent = T)
A bit of cell cycle genes (like CDC42, IAH1) are differentially expressed. Meh.
MA_limma(efit, coef = 2, n = 80, text = F, lfc = 1)
topTable(efit, coef = 2)
## Prot logFC AveExpr t P.Value
## P62491|RB11A_HUMAN RAB11A 6.565663 21.61864 73.07138 1.224958e-35
## Q9UK45|LSM7_HUMAN TM9SF4 4.310182 22.25137 29.06216 1.082960e-23
## Q8NI22|MCFD2_HUMAN STX16 -4.824158 21.24154 -27.14160 7.983519e-23
## P15531|NDKA_HUMAN NME1 -4.858458 24.08372 -25.75981 3.649413e-22
## P04080|CYTB_HUMAN GSTM1 -3.713739 24.81541 -24.52886 1.506951e-21
## Q93009|UBP7_HUMAN ANP32B -4.400228 20.86671 -23.68718 4.126489e-21
## P61758|PFD3_HUMAN NMT1 3.975372 22.35427 23.18329 7.660258e-21
## O60888|CUTA_HUMAN STT3B -4.317837 23.85654 -21.78178 4.570176e-20
## O43765|SGTA_HUMAN RPL38 6.831604 24.18559 20.13150 4.284510e-19
## P49756|RBM25_HUMAN SCYL2 -4.289859 22.32136 -19.93083 5.685717e-19
## adj.P.Val B
## P62491|RB11A_HUMAN 2.261272e-32 66.35270
## Q9UK45|LSM7_HUMAN 9.995725e-21 43.85310
## Q8NI22|MCFD2_HUMAN 4.912526e-20 42.00238
## P15531|NDKA_HUMAN 1.684204e-19 40.52858
## P04080|CYTB_HUMAN 5.563662e-19 39.17228
## Q93009|UBP7_HUMAN 1.269583e-18 38.07684
## P61758|PFD3_HUMAN 2.020120e-18 37.58024
## O60888|CUTA_HUMAN 1.054568e-17 35.78400
## O43765|SGTA_HUMAN 8.788006e-17 33.60017
## P49756|RBM25_HUMAN 1.049583e-16 33.32666
numGenes <- nrow(exprs(exp_set))
full_list <- topTable(efit, number = numGenes)
## Removing intercept from test coefficients
full_list <- full_list[full_list$adj.P.Val <= 0.05,]
my_list <- full_list[1:20,]
dif_exp_set <- exp_set[fData(exp_set)$Prot %in% my_list$Prot, ]
ggplot() +
geom_point(data = full_list,
aes(logFC, -log10(adj.P.Val),
color = (abs(logFC) > 1) & (adj.P.Val < .01) ), show.legend = F) +
geom_text_repel(data = my_list,
aes(logFC, -log10(adj.P.Val),
label = Prot)) +
theme_minimal() +
ylab("-log10 of adjusted p value") +
scale_color_viridis_d(direction = -1, begin = .3)
Cute volcano-plot)
# Significant results
sign_results <- full_list %>%
filter(adj.P.Val < .05)
rownames(sign_results) <- rownames(sign_results) %>% str_remove_all("\\|.*")
sign_results$Prot <- sign_results %>% rownames
# Filter directions
sign_up <- sign_results %>% filter(logFC > 0)
sign_dw <- sign_results %>% filter(logFC < 0)
gene_up <- sign_up %>% rownames
gene_dw <- sign_dw %>% rownames
cat("Up-regulated in wt:", length(gene_up), "genes")
## Up-regulated in wt: 572 genes
cat("Down-regulated in wt:", length(gene_dw), "genes")
## Down-regulated in wt: 551 genes
Similar amount of genes are up- and down- regulated
KEGG_enrich_dw <- enrichKEGG(gene_dw, organism = "hsa", keyType = "uniprot",
pAdjustMethod = "BH")
## Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
## Reading KEGG annotation online: "https://rest.kegg.jp/conv/uniprot/hsa"...
KEGG_enrich_up <- enrichKEGG(gene_up, organism = "hsa", keyType = "uniprot",
pAdjustMethod = "BH")
enrich <-
KEGG_enrich_dw@result %>%
merge(data.frame(type = "dw")) %>% merge("ENSEMBL") %>%
rbind(
KEGG_enrich_up@result %>%
merge(data.frame(type = "up")) %>% merge("ENSEMBL"))
enrich$Description <- enrich$Description %>%
fct_reorder(enrich$Count)
gg <- ggplot(enrich, aes(y = Description, x = Count,
color = -log10(p.adjust))) +
geom_point() +
facet_grid(~type) +
theme_minimal() +
xlab("counts") + ylab ("KEGG ID") +
theme(axis.text.y = element_blank()) +
scale_color_viridis_c(direction = -1)
ggplotly(gg)
dotplot(KEGG_enrich_dw)
dotplot(KEGG_enrich_up)
cnetplot(KEGG_enrich_dw)
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
cnetplot(KEGG_enrich_up)
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
GO_enrich_dw_CC <- enrichGO(sign_dw %>% rownames, "org.Hs.eg.db", keyType = "UNIPROT", ont = "CC")
GO_enrich_up_CC <- enrichGO(sign_up %>% rownames, "org.Hs.eg.db", keyType = "UNIPROT", ont = "CC")
dotplot(GO_enrich_dw_CC, showCategory = 10)
dotplot(GO_enrich_up_CC, showCategory = 10)
pairwise_termsim(GO_enrich_dw_CC) %>% goplot(showCategory = 2)
pairwise_termsim(GO_enrich_up_CC) %>% goplot()
GO_enrich_dw_MF <- enrichGO(sign_dw %>% rownames, "org.Hs.eg.db", keyType = "UNIPROT", ont = "MF")
GO_enrich_up_MF <- enrichGO(sign_up %>% rownames, "org.Hs.eg.db", keyType = "UNIPROT", ont = "MF")
dotplot(GO_enrich_dw_MF, showCategory = 10)
dotplot(GO_enrich_up_MF, showCategory = 10)
pairwise_termsim(GO_enrich_dw_MF) %>% goplot()
pairwise_termsim(GO_enrich_up_MF) %>% goplot()
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
GO_enrich_dw_BP <- enrichGO(sign_dw %>% rownames, "org.Hs.eg.db", keyType = "UNIPROT", ont = "BP")
GO_enrich_up_BP <- enrichGO(sign_up %>% rownames, "org.Hs.eg.db", keyType = "UNIPROT", ont = "BP")
dotplot(GO_enrich_dw_BP, showCategory = 10)
Similarly to GO by MF, diffrent transcription and RNA processing regulators are upregulated according to BP in wt samples
dotplot(GO_enrich_up_BP, showCategory = 10)
A lot of categories are down-regulated and up-regulate at the same time. So it is obligatory to run GSEA and grouped GO
pairwise_termsim(GO_enrich_dw_BP) %>% goplot(showCategory = 2)
MF GOplots are unreadable - almost always.
pairwise_termsim(GO_enrich_up_BP)%>% goplot(showCategory = 3)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ENM <- function(x0, x, ont, rbd = T){
x <- x@result %>%
mutate(ONTOLOGY = ont)
if(rbd) x <- rbind(x0, x)
return(x)
}
GO_enrich <-
ENM(x = GO_enrich_dw_BP, ont = "BP", rbd = F) %>%
ENM(GO_enrich_dw_CC, "CC") %>%
ENM(GO_enrich_dw_MF, "MF") %>%
ENM(GO_enrich_up_BP, "BP") %>%
ENM(GO_enrich_up_CC, "CC") %>%
ENM(GO_enrich_up_MF, "MF")
GO_enrich <- data.frame(Category = GO_enrich$ONTOLOGY,
ID = GO_enrich$ID,
Term = GO_enrich$Description,
Genes = GO_enrich$geneID %>%
str_replace_all("\\/", ", "),
adj_pval = GO_enrich$p.adjust)
GO_prep <- data.frame(ID = sign_results %>% rownames,
logFC = sign_results$logFC)
circ <- circle_dat(GO_enrich, GO_prep)
circGO <- circ[-which(duplicated(circ$ID)),]
circGO$term <- fct_reorder(circGO$term, circGO$zscore)
MF3 <- groupGO(sign_dw %>% rownames, OrgDb = "org.Hs.eg.db",
keyType = "UNIPROT", level = 3, ont = "MF")@result$ID
BP3 <- groupGO(sign_dw %>% rownames, OrgDb = "org.Hs.eg.db",
keyType = "UNIPROT", level = 3, ont = "BP")@result$ID
CC3 <- groupGO(sign_dw %>% rownames, OrgDb = "org.Hs.eg.db",
keyType = "UNIPROT", level = 3, ont = "CC")@result$ID
gg <-
circGO %>%
subset(category == "BP") %>%
subset(count > 5) %>%
subset(adj_pval < 0.01) %>%
ggplot () +
geom_col (mapping = aes(y = term, x = zscore, fill = adj_pval)) +
ylab ("") +
theme_minimal() +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_line()) +
scale_fill_gradient("Adjusted p-value",
low = "darkblue", high = "cadetblue1")
ggplotly(gg)
gg <-
circGO %>%
subset(category == "MF") %>%
subset(count > 5) %>%
subset(adj_pval < 0.05) %>%
ggplot () +
geom_col (mapping = aes(y = term, x = zscore, fill = adj_pval)) +
ylab ("") +
theme_minimal() +
theme(axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_line()) +
scale_fill_gradient("Adjusted p-value",
low = "darkblue", high = "cadetblue1")
ggplotly(gg)
Full plot:
gg <-
circGO %>%
subset(ID %in% c(MF3, BP3, CC3)) %>%
ggplot () +
geom_col (mapping = aes(y = term, x = zscore, fill = adj_pval)) +
ylab ("") +
theme_minimal() +
theme(axis.text.y = element_text(size = 5),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_line(),
legend.position = "bottom",
strip.text.y = element_text(angle = 0)) +
facet_grid(category~., scales = "free_y", space = "free") +
scale_fill_gradient("Adjusted p-value",
low = "darkblue", high = "cadetblue1")
ggplotly(gg)
ggplotly breaks a part of aesthetics, but what else)
circ %>%
subset(ID %in% c(MF3, BP3, CC3)) %>%
GOBubble(display = F)
pathways <- gmtPathways("wikipathways-20240410-gmt-Homo_sapiens.gmt")
names(pathways) <- names(pathways) %>%
str_remove_all("\\%W.*")
nr <- sign_results %>%
rownames %>%
bitr(fromType = "UNIPROT", toType = "ENTREZID", OrgDb = org.Hs.eg.db) %>%
subset(!duplicated("ENTREZID"))
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(., fromType = "UNIPROT", toType = "ENTREZID", OrgDb =
## org.Hs.eg.db): 0.53% of input gene IDs are fail to map...
nr <- sign_results %>%
rownames_to_column("UNIPROT") %>%
left_join(nr)
## Joining with `by = join_by(UNIPROT)`
NR <- c(nr$logFC)
names(NR) <- nr$ENTREZID
fgsea_results <- fgseaMultilevel(pathways, NR)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (3.13% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
head(fgsea_results[order(pval), ])
## pathway pval padj
## <char> <num> <num>
## 1: Myometrial relaxation and contraction pathways 7.225501e-08 4.877214e-05
## 2: Calcium regulation in cardiac cells 1.414574e-04 4.774187e-02
## 3: Cell cycle 9.029024e-04 1.840715e-01
## 4: Prostaglandin synthesis and regulation 1.236957e-03 1.840715e-01
## 5: Ferroptosis 1.363493e-03 1.840715e-01
## 6: Oxidative stress response 1.904527e-03 1.840892e-01
## log2err ES NES size leadingEdge
## <num> <num> <num> <int> <list>
## 1: 0.7049757 0.7096672 2.719837 23 58, 2783....
## 2: 0.5188481 0.5803458 2.169119 22 2783, 27....
## 3: 0.4772708 0.7774974 1.939561 7 7532, 75....
## 4: 0.4550599 -0.7289878 -1.951763 9 6277, 53....
## 5: 0.4550599 -0.6566857 -1.919333 12 1803, 21....
## 6: 0.4550599 -0.7729638 -1.919481 7 6648, 17....
topPathwaysUp <- fgsea_results[ES > 0][head(order(pval), n=10), pathway]
topPathwaysDown <- fgsea_results[ES < 0][head(order(pval), n=10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
plotGseaTable(pathways[topPathways], NR, fgsea_results,
gseaParam=0.5)
GO_gsea <- gseGO(NR[order(NR, decreasing = T)],
ont = "ALL", "org.Hs.eg.db", eps = 0)
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (3.13% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## leading edge analysis...
## done...
ridgeplot(GO_gsea)
## Picking joint bandwidth of 0.339
The basic GO gives similar results for both lines. Cellular localization and biological processes do not differ in general. A rare difference is the down-regulation of aerobic respiratory processes compared to the control. But the group GO plots already show clear differences.
Only a small group of GO-determining genes appears to be differentially up-regulated significantly. These are genes responsible for myosin binding - probably because they compared specifically cardiac tissue of sick people, with the normal level, where myogenesis processes are less intensified. Similarly, up-regulated adrenergic receptor genes may be a consequence of treatment.
On the other hand, the number of down-regulated genes determining GO compared to controls is high. These are genes responsible for different stages of gene expression, metabolic activity, aerobic respiration and many other processes, judging by all simply are indicators of the deplorable state of the tissue.
GSEA shows that normal processes in cardiac tissue are down-regulated compared to controls. The cell cycle is slower. The cytoskeleton, including the muscle-specific cytoskeleton, works worse. On the contrary, fatty acid transporters, apparently activated in connective tissue cells, and ferroptosis processes are activated. There is an active response to oxidative stress, the work of cyclooxygenase pathway.
It all looks like symptoms of a heart attack or some such oxygen deprivation coupled with hemorrhaging. The cause must be in there somewhere).